---
title: "Script"
author: "Maria Knapczyk"
format:
  html:
    toc: true
    toc-location: right
    toc-title: Spis Treści
    number-sections: true
    embed-resources: true
    html-math-method: katex
    code-tools: true
    code-fold: show
    code-summary: "Show and hide code"
    link-external-icon: true
    link-external-newwindow: true
    smooth-scroll: true
    self-contained: true
    fig-align: center
execute:
  echo: true
  error: false
  warning: false
  output: true
---
## Intro
```{r}
library (tidymodels)
library (tidyverse)
library (stringr)
library (vip)
library (pdp)
library (DALEX)
library (DALEXtra)
library (bestNormalize)
library (rules)
library (baguette)
library (finetune)
library (doParallel)
library (DT)
library (ggplot2)
library (lubridate)
library (future)
tidymodels_prefer ()
```
```{r}
# Load the raw coffee-shop sales export; use `<-` (not `=`) for
# top-level assignment, per R convention.
data <- read.csv("coffee_shop_sales.csv")
# Interactive table preview of the raw data.
datatable(data)
```
## Data
```{r}
str (data)
```
```{r}
data <- data |>
  as_tibble() |>
  janitor::clean_names() |>
  mutate(
    transaction_date = as.Date(transaction_date),
    # Combine date + clock time into a single POSIXct timestamp.
    transaction_datetime = as.POSIXct(
      paste(transaction_date, transaction_time),
      format = "%Y-%m-%d %H:%M:%S"
    )
  ) |>
  mutate(
    hour = hour(transaction_datetime)
  ) |>
  mutate(
    # Human-readable day label (locale-dependent, display only).
    wday = wday(transaction_date, label = TRUE, abbr = TRUE),
    # BUG FIX: the original compared the label against c("Sat", "Sun"),
    # which silently fails in any non-English locale (e.g. "sob"/"niedz"
    # in Polish). Use day-of-week numbers instead: with lubridate's
    # default week_start, Sunday = 1 and Saturday = 7.
    day_work = if_else(
      wday(transaction_date) %in% c(1, 7), "weekend", "week"
    ),
    day_work = factor(day_work)
  )
```
```{r}
# Quick distribution overview of every column.
summary(data)
```
```{r}
# Missing-value count per column.
data |> is.na() |> colSums()
```
```{r}
# Correlation matrix of the numeric columns only.
cor(data[sapply(data, is.numeric)])
```
## Plots
```{r}
# Reusable count bar chart for one categorical column.
plot_count_bar <- function(df, col, title, x_lab) {
  ggplot(df, aes(x = {{ col }})) +
    geom_bar(fill = "cadetblue", color = "black") +
    labs(title = title, x = x_lab, y = "Liczba transakcji")
}

# Price distribution.
ggplot(data, aes(x = total_price)) +
  geom_histogram(binwidth = 1, fill = "cadetblue", color = "black") +
  labs(title = "Histogram cen", x = "Cena", y = "Ilość")

# Loyalty-points distribution.
ggplot(data, aes(x = loyalty_points_earned)) +
  geom_histogram(binwidth = 1, fill = "cadetblue", color = "black") +
  labs(
    title = "Histogram zdobytych punktów lojanościowych",
    x = "Przedziały",
    y = "Ilość"
  )

# Transaction counts by category, payment method and location.
plot_count_bar(
  data, product_category,
  "Liczba transakcji w poszczególnych kategoriach produktów",
  "Kategoria produktu"
)
plot_count_bar(
  data, payment_method,
  "Liczba transakcji poszczególnych metod płatniczych",
  "Metoda płatności"
)
plot_count_bar(
  data, location,
  "Liczba transakcji w poszczególnych lokalizacjach",
  "Lokalizacja"
)
```
```{r}
# Top three products per hour. slice_max() keeps ties, so a single hour
# can contribute more than three rows.
hourly_products <- data |>
  count(hour, product_name, name = "transactions") |>
  arrange(hour, desc(transactions)) |>
  group_by(hour) |>
  slice_max(transactions, n = 3)

ggplot(hourly_products, aes(x = factor(hour), y = transactions, fill = product_name)) +
  geom_col() +
  labs(
    title = "Najczęściej kupowany produkt w każdej godzinie",
    x = "Godzina",
    y = "Liczba transakcji",
    fill = "Produkt"
  )
```
## Formula
```{r}
forms <- formula (total_price ~ .)
```
## Data split
```{r}
set.seed(111)

# Three-way train/validation/test split, stratified on the outcome.
trans_split <- initial_validation_split(data = data, strata = total_price)
trans_train <- training(trans_split)
trans_test <- testing(trans_split)
trans_valid <- validation_set(trans_split)

# Repeated 10-fold CV on the training portion (exploratory setting).
trans_folds <- vfold_cv(trans_train, strata = total_price, v = 10, repeats = 5)

save(
  trans_split, trans_train, trans_test, trans_valid, trans_folds,
  file = "coffesplitted_data.Rdata"
)
```
```{r}
load ("coffesplitted_data.Rdata" )
```
## Model
```{r}
# CART decision tree: tune pruning strength and minimum leaf size.
cart_spec <- decision_tree(
  cost_complexity = tune(),
  min_n = tune()
) |>
  set_engine("rpart") |>
  set_mode("regression")

# Random forest: tune predictors per split, leaf size and forest size.
rf_spec <- rand_forest(
  mtry = tune(),
  min_n = tune(),
  trees = tune()
) |>
  set_engine("ranger") |>
  set_mode("regression")

# Cubist rule-based ensemble: tune committee count and neighbours.
cubist_spec <- cubist_rules(
  committees = tune(),
  neighbors = tune()
) |>
  set_engine("Cubist") |>
  set_mode("regression")

# Gradient-boosted trees (xgboost).
xgb_spec <- boost_tree(
  tree_depth = tune(),
  learn_rate = tune(),
  loss_reduction = tune(),
  min_n = tune(),
  sample_size = tune(),
  trees = tune()
) |>
  set_engine("xgboost") |>
  set_mode("regression")

# Multivariate adaptive regression splines.
mars_spec <- mars(
  num_terms = tune(),
  prod_degree = tune()
) |>
  set_engine("earth") |>
  set_mode("regression")

# k-nearest neighbours.
knn_spec <- nearest_neighbor(
  neighbors = tune()
) |>
  set_engine("kknn") |>
  set_mode("regression")
```
## Recipe
```{r}
# Shared preprocessing: demote ID-like columns from the predictor role
# and drop columns that presumably leak the outcome (quantity and
# unit_price determine total_price; loyalty points appear derived from
# the price — TODO confirm against the data dictionary).
basic_rec <- recipe(total_price ~ ., data = trans_train) %>%
  step_mutate(hour = as.numeric(hour)) %>%
  update_role(
    transaction_id, transaction_date, transaction_time,
    product_id, customer_id, barista_id,
    new_role = "ID"
  ) %>%
  step_rm(quantity, unit_price, loyalty_points_earned)

# Variant for engines that need all-numeric input (xgboost, mars, knn):
# dummy-encode the nominal predictors.
basic_t_rec <- recipe(total_price ~ ., data = trans_train) %>%
  update_role(transaction_id, transaction_date, transaction_time,
              product_id, customer_id, barista_id, new_role = "ID") %>%
  step_mutate(
    transaction_datetime = as.numeric(transaction_datetime),
    hour = as.numeric(hour)
  ) %>%
  step_rm(quantity, unit_price, loyalty_points_earned) %>%
  step_unknown(all_nominal_predictors()) %>%
  step_dummy(all_nominal_predictors()) %>%
  # BUG FIX: after step_dummy() there are no nominal predictors left, so
  # the original step_zv(all_nominal_predictors()) screened nothing.
  # Screen every predictor so zero-variance dummy columns are removed.
  step_zv(all_predictors())

# Cubist handles factors natively; guard against unseen/novel levels.
cubist_rec <- recipe(total_price ~ ., data = trans_train) %>%
  update_role(transaction_id, transaction_date, transaction_time,
              product_id, customer_id, barista_id, new_role = "ID") %>%
  step_mutate(
    transaction_datetime = as.numeric(transaction_datetime),
    hour = as.numeric(hour)
  ) %>%
  step_rm(quantity, unit_price, loyalty_points_earned) %>%
  step_unknown(all_nominal_predictors()) %>%
  step_novel(all_nominal_predictors()) %>%
  step_string2factor(all_nominal_predictors()) %>%
  step_zv(all_predictors())

# Sanity check: materialize the transformed training set.
prep(basic_t_rec, training = trans_train) %>% juice()
```
```{r}
# Materialize the basic recipe on the training data for inspection.
basic_rec |> prep(training = trans_train) |> juice()
```
```{r}
# Variable roles of the basic recipe, rendered as a table.
basic_rec |> summary() |> knitr::kable()
```
## Workflow
```{r}
# Bundle recipes x model specs into workflow sets, then combine them.
a <- workflow_set (
preproc = list (b = basic_rec),
models = list (
rpart = cart_spec,
ranger = rf_spec
)
)
b <- workflow_set (
preproc = list (t = basic_t_rec),
models = list (xgboost = xgb_spec
)
)
c <- workflow_set (
preproc = list (cubist = cubist_rec),
models = list (cubist = cubist_spec)
)
# NOTE(review): this set is labelled "transformed" but uses basic_rec,
# not basic_t_rec — confirm which recipe was intended (the grid-tuning
# chunk later rebuilds mars/knn with basic_t_rec).
d <- workflow_set (
preproc = list (transformed = basic_rec),
models = list (
mars = mars_spec,
knn = knn_spec
)
)
basic <- bind_rows (a, b, c, d)
# Strip the first two characters of each workflow id (the preproc
# prefix "b_"/"t_"...). NOTE(review): this trim is width-sensitive —
# "cubist_cubist" becomes "bist_cubist" and "transformed_mars" becomes
# "ansformed_mars"; later chunks rely on these exact trimmed ids.
basic$ wflow_id <- str_sub (basic$ wflow_id, start = 3 , end = 100 )
basic
```
## Parameters
```{r}
# Tuning ranges (n = number of training rows, p = number of predictors).
cart_param <- cart_spec |>
  extract_parameter_set_dials() |>
  update(
    min_n = min_n(c(5, 30)) # < 1% of n
  )

rf_param <- rf_spec |>
  extract_parameter_set_dials() |>
  update(
    min_n = min_n(c(5, 30)),
    mtry = mtry(c(4, 9)), # roughly sqrt(p) to p / 2
    trees = trees(c(100, 500))
  )

xgb_param <- xgb_spec |>
  extract_parameter_set_dials() |>
  update(
    min_n = min_n(c(5, 30)),
    trees = trees(c(100, 500)),
    tree_depth = tree_depth(c(2, 10)),
    # BUG FIX: dials' learn_rate() and loss_reduction() interpret `range`
    # on the log10 scale by default, so c(0.01, 0.3) actually meant
    # 10^0.01..10^0.3 (~1.02..2.0) and c(0, 5) meant 1..100000.
    # Disable the transform to tune on the intended original scale.
    learn_rate = learn_rate(range = c(0.01, 0.3), trans = NULL),
    loss_reduction = loss_reduction(range = c(0, 5), trans = NULL),
    sample_size = sample_prop(c(0.5, 1))
  )

cubist_param <- cubist_spec |>
  extract_parameter_set_dials() |>
  update(
    committees = committees(c(1, 10)),
    neighbors = neighbors(c(0, 5))
  )

mars_param <- mars_spec |>
  extract_parameter_set_dials() |>
  update(
    num_terms = num_terms(c(2, 30)),
    prod_degree = prod_degree(c(1, 2))
  )

knn_param <- knn_spec |>
  extract_parameter_set_dials() |>
  update(
    neighbors = neighbors(c(3, 15))
  )

# Attach the custom parameter sets to the matching workflows.
basic <- basic |> option_add(param_info = cart_param, id = "rpart")
basic <- basic |> option_add(param_info = rf_param, id = "ranger")
basic <- basic |> option_add(param_info = xgb_param, id = "xgboost")
basic <- basic |> option_add(param_info = cubist_param, id = "bist_cubist")
# NOTE(review): after the str_sub() trim, no workflow is named "mars" or
# "knn", so these two option_add() calls match nothing; mars/knn are
# tuned later from the separately built `grid_models` set instead.
basic <- basic |> option_add(param_info = mars_param, id = "mars")
basic <- basic |> option_add(param_info = knn_param, id = "knn")
basic
```
```{r}
# Inspect the dials parameter objects attached to each workflow.
basic |>
  split(~wflow_id) |>
  imap(\(x, id) {
    param_set <- extract_parameter_set_dials(x = x, id = id)
    param_set$object
  })
```
## Grid and tune
```{r}
# Racing control: keep out-of-sample predictions for later inspection,
# parallelize over both resamples and parameters; do not store fitted
# workflows (they are refit from the best config later).
race_ctrl <- control_race(
  save_pred = TRUE,
  parallel_over = "everything",
  save_workflow = FALSE,
  verbose = TRUE
)
```
```{r}
# Use physical cores only (logical = FALSE); start a PSOCK cluster and
# register it as the foreach backend for tune's parallel execution.
cores <- parallel:: detectCores (logical = FALSE )
cl <- makePSOCKcluster (cores)
registerDoParallel (cl)
# NOTE(review): "bist_cubist" is the cubist workflow's id AFTER the
# width-sensitive str_sub() trim applied to `basic` earlier — keep the
# two in sync if the trim ever changes.
race_models <- basic |>
filter (wflow_id %in% c ("rpart" , "ranger" , "bist_cubist" , "xgboost" ))
time_race <- Sys.time ()
# ANOVA racing: evaluates 50 candidate configs per workflow, dropping
# clearly-losing candidates after the first few resamples.
race_result <- workflow_map (
race_models,
"tune_race_anova" ,
seed = 111 ,
resamples = trans_folds,
grid = 50 ,
control = race_ctrl,
verbose = TRUE ,
metrics = metric_set (rmse, mae, rsq)
)
# Elapsed tuning time. The PSOCK cluster is deliberately left running;
# it is stopped in the next chunk after grid tuning finishes.
Sys.time () - time_race
```
```{r}
# Plain grid-search control for the models that are not raced.
grid_ctrl <- control_grid (
save_pred = TRUE ,
parallel_over = "everything"
)
# Rebuild the mars/knn workflow set for grid tuning.
# NOTE(review): this uses basic_t_rec, while set `d` in the workflow
# chunk used basic_rec under the same "transformed" label — confirm
# which recipe is intended; the param_info objects built earlier for
# mars/knn are not attached here, so default ranges are used.
grid_models <- workflow_set (
preproc = list (transformed = basic_t_rec),
models = list (
mars = mars_spec,
knn = knn_spec
)
)
# Tune all workflows in the set at once over a 20-point grid.
grid_result <- workflow_map (
grid_models,
"tune_grid" ,
resamples = trans_folds,
grid = 20 ,
metrics = metric_set (rmse, mae, rsq),
control = grid_ctrl
)
# Shut down the PSOCK cluster started in the racing chunk.
stopCluster (cl)
save (race_result, grid_result, file = "tune_results_split.Rdata" )
```
## Best model selection
```{r}
load("tune_results_split.Rdata")

# All tuning results (raced + grid) in one workflow-set tibble.
combined_results <- bind_rows(race_result, grid_result)

# Best hyper-parameter combination (lowest RMSE) per workflow.
# BUG FIX: the original call was `select_best(metric = "rmse" , )` —
# the trailing comma passes an empty argument through `...`, which
# tidy-dots rejects.
best_results <-
  combined_results |>
  split(~ wflow_id) |>
  map(
    \(x)
    extract_workflow_set_result(x = x, id = x$wflow_id) |>
      select_best(metric = "rmse")
  )

# Finalize each workflow with its best parameters, fit on the training
# data and evaluate once on the held-out test set.
best_models <-
  combined_results |>
  split(~ wflow_id) |>
  map(
    \(x)
    extract_workflow(x = x, id = x$wflow_id) |>
      finalize_workflow(best_results[[x$wflow_id]]) |>
      last_fit(
        split = trans_split,
        metrics = metric_set(rmse, rsq, mae)
      )
  )

save(best_models, file = "best_models.rdata")
```
```{r}
# BUG FIX: the tuning results were saved as "tune_results_split.Rdata"
# above; "tune_result.Rdata" does not exist and load() would error.
load("tune_results_split.Rdata")
load("best_models.rdata")
```
## Result tune
```{r}
# One-row summary (best-RMSE configuration) per workflow, as a table.
combined_results |>
  split(~wflow_id) |>
  imap(
    \(x, id)
    extract_workflow_set_result(x = x, id = id) |>
      show_best(metric = "rmse", n = 1) |>
      select(-n, -.metric, -.config)
  ) |>
  knitr::kable()
```
```{r}
# Validation-set metrics table: mean and std_err spread into separate
# columns for each metric, best workflow (lowest MAE) first.
combined_results |>
  rank_results(select_best = TRUE) |> # TRUE, not the reassignable alias T
  unite("rate", c("mean", "std_err"), sep = "/") |>
  pivot_wider(names_from = .metric, values_from = rate) |>
  separate_wider_delim(
    cols = mae:rsq,
    delim = "/",
    names = c("", "_std_err"),
    names_sep = ""
  ) |>
  select(-preprocessor, -n, -model) |>
  # Keep only the informative tail of the long .config label.
  mutate(.config = str_sub(.config, 20, 30)) |>
  mutate(across(
    .cols = mae:rsq_std_err,
    .fns = \(x) signif(x = as.numeric(x), digits = 3)
  )) |>
  arrange(mae) |>
  gt::gt() |>
  gt::tab_header(title = "Wyniki oceny dla zestawu walidacyjnego")
```
```{r}
# Rank plot: per-workflow mean metric with an approximate 95% interval
# (mean +/- 1.96 * std_err), one panel per metric.
combined_results |>
  rank_results(select_best = TRUE) |> # TRUE, not the reassignable alias T
  mutate(wflow_id = fct_reorder(wflow_id, rank)) |>
  ggplot(aes(wflow_id, mean, colour = wflow_id)) +
  geom_point() +
  geom_errorbar(
    aes(
      ymin = mean - 1.96 * std_err,
      ymax = mean + 1.96 * std_err
    ),
    width = 0.8
  ) +
  facet_wrap(~.metric, scales = "free_y") +
  theme_bw() +
  labs(x = "model", y = "value metric", colour = "model") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
```
```{r}
# Persist the most recently rendered plot (the rank plot above).
ggsave(
  filename = "rys. 3._valid_result_model.jpeg",
  device = "jpeg",
  width = 8.5,
  height = 3,
  dpi = 300
)
```
```{r}
# Per-configuration metric estimates used by the exploration plots below.
# NOTE(review): `_$ .metrics |> _[[1 ]]` extracts the metric tibble of
# the FIRST resample only, not an aggregate across all folds — confirm
# that single-fold estimates are intended here (collect_metrics() would
# give averaged ones).
all_fit_metrics <-
combined_results |>
split (~ wflow_id) |>
map (
\(x)
extract_workflow_set_result (x = x, id = x$ wflow_id) |>
_$ .metrics |>
_[[1 ]] |>
# Shorten the long .config label to its distinguishing part.
mutate (.config = str_sub (.config, 20 , 24 ))
)
```
### Ranger
```{r}
# Best ranger configuration for each of the three metrics.
# BUG FIX: with an UNNAMED input vector, map_dfr(.id = ".metric")
# records the element index ("1", "2", "3"), not the metric name.
# Record the metric explicitly, as the rpart/xgboost chunks do.
c("mae", "rmse", "rsq") |>
  map_dfr(
    \(x)
    combined_results |>
      extract_workflow_set_result(id = "ranger") |>
      select_best(metric = x) |>
      mutate(.metric = x)
  ) |>
  gt::gt()
```
```{r}
# Ranger tuning surface: RMSE vs trees, coloured by min_n, one panel
# per mtry. Swap the model name / metric (rsq, mae) to explore others.
all_fit_metrics[["ranger"]] |>
  filter(.metric == "rmse") |>
  ggplot(aes(trees, .estimate, color = factor(min_n))) +
  geom_point() +
  facet_wrap(~mtry) +
  scale_x_continuous(limits = c(0, 600), breaks = seq(0, 600, 100)) +
  ggtitle(label = "ranger") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
```
Najlepszy mtry = 4
300/400 drzew jest wystarczające
Najniższe RMSE dla sporej liczby min_n - koło 25, drzewa płytsze i zgeneralizowane
```{r}
# Ranger configurations with RMSE below 2.74, best first.
all_fit_metrics[["ranger"]] |>
  filter(.metric == "rmse", .estimate < 2.74) |>
  arrange(.estimate) |>
  gt::gt()
```
Najlepsze parametry z config 07
### Cubist
```{r}
# Best cubist configuration for each of the three metrics.
# BUG FIX: with an UNNAMED input vector, map_dfr(.id = ".metric")
# records the element index ("1", "2", "3"), not the metric name.
# Record the metric explicitly, as the rpart/xgboost chunks do.
c("mae", "rmse", "rsq") |>
  map_dfr(
    \(x)
    combined_results |>
      extract_workflow_set_result(id = "bist_cubist") |>
      select_best(metric = x) |>
      mutate(.metric = x)
  ) |>
  gt::gt()
```
```{r}
# Cubist tuning surface: RMSE vs committees, one panel per neighbours
# value; jitter separates overlapping committee values.
all_fit_metrics[["bist_cubist"]] |>
  filter(.metric == "rmse") |>
  ggplot(aes(committees, .estimate, color = factor(neighbors))) +
  geom_point(position = position_jitter(width = 0.5, height = 0)) +
  facet_wrap(~neighbors, scales = "free_y") +
  scale_x_continuous(limits = c(0, 120), breaks = seq(0, 120, 20), expand = c(0, 0)) +
  ggtitle(label = "Cubist") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
```
Najlepsze neighbors to 0, a dla największych wartości im bardziej je zwiększam, tym RMSE się zmniejsza.
```{r}
# Zoom in on neighbours 7-9: RMSE vs committees as connected lines.
all_fit_metrics[["bist_cubist"]] |>
  filter(.metric == "rmse") |>
  filter(between(neighbors, 7, 9)) |>
  ggplot(aes(committees, .estimate, color = factor(neighbors))) +
  geom_point() +
  geom_line() +
  scale_x_continuous(limits = c(0, 120), breaks = seq(0, 120, 20), expand = c(0, 0)) +
  ggtitle(label = "Cubist")
```
```{r}
# Cubist configurations with RMSE below 2.86, best first.
all_fit_metrics[["bist_cubist"]] |>
  filter(.metric == "rmse", .estimate < 2.86) |>
  arrange(.estimate) |>
  gt::gt()
```
Dla małych neighbors dodatkowe komisje nie mają sensu, model jest już optymalny.
Dla większych neighbors dodanie komisji nieznacznie zmienia RMSE, efekt jest minimalny.
### Rpart
```{r}
# Best rpart configuration for each of the three metrics; the metric
# name is recorded explicitly in a .metric column.
c("mae", "rmse", "rsq") |>
  map_dfr(
    \(m)
    combined_results |>
      extract_workflow_set_result(id = "rpart") |>
      select_best(metric = m) |>
      mutate(.metric = m)
  ) |>
  gt::gt()
```
```{r}
# rpart tuning surface: RMSE vs cost_complexity (log10 axis), coloured
# by min_n.
all_fit_metrics[["rpart"]] |>
  filter(.metric == "rmse") |>
  ggplot(aes(cost_complexity, .estimate, color = factor(min_n))) +
  geom_point() +
  geom_line() +
  scale_x_log10(
    breaks = scales::breaks_log(n = 10, base = 10),
    labels = scales::label_log(base = 10)
  ) +
  labs(title = "rpart", color = "min_n", y = "rmse")
```
```{r}
# Three lowest-RMSE rpart configurations.
all_fit_metrics[["rpart"]] |>
  filter(.metric == "rmse") |>
  slice_min(.estimate, n = 3)
```
### xgboost
```{r}
# Best xgboost configuration for each of the three metrics; the metric
# name is recorded explicitly in a .metric column.
c("mae", "rmse", "rsq") |>
  map_dfr(
    \(m)
    combined_results |>
      extract_workflow_set_result(id = "xgboost") |>
      select_best(metric = m) |>
      mutate(.metric = m)
  ) |>
  gt::gt()
```
```{r}
# xgboost tuning surface: RMSE vs trees, coloured by min_n, one panel
# per tree_depth.
all_fit_metrics[["xgboost"]] |>
  filter(.metric == "rmse") |>
  ggplot(aes(trees, .estimate, color = factor(min_n))) +
  geom_point() +
  facet_wrap(~tree_depth, scales = "free_y") +
  labs(title = "xgboost", color = "min_n", y = "rmse")
```
rmse <2.8
```{r}
# Same view restricted to configurations with RMSE below 2.8.
all_fit_metrics[["xgboost"]] |>
  filter(.metric == "rmse", .estimate < 2.8) |>
  ggplot(aes(trees, .estimate, color = factor(min_n))) +
  geom_point() +
  facet_wrap(~tree_depth) +
  labs(title = "xgboost", color = "min_n", y = "rmse")
```
min_n 9 oraz depth 5
### Transformed mars
```{r}
# Best MARS configuration for each of the three metrics.
# BUG FIX: with an UNNAMED input vector, map_dfr(.id = ".metric")
# records the element index ("1", "2", "3"), not the metric name.
# Record the metric explicitly, as the rpart/xgboost chunks do.
c("mae", "rmse", "rsq") |>
  map_dfr(
    \(x)
    combined_results |>
      extract_workflow_set_result(id = "transformed_mars") |>
      select_best(metric = x) |>
      mutate(.metric = x)
  ) |>
  gt::gt()
```
```{r}
# MARS tuning surface: RMSE vs num_terms, one panel per prod_degree.
all_fit_metrics[["transformed_mars"]] |>
  filter(.metric == "rmse") |>
  ggplot(aes(num_terms, .estimate, color = factor(prod_degree))) +
  geom_point() +
  geom_line() +
  facet_wrap(~prod_degree, scales = "free_y") +
  ggtitle(label = "transformed_mars") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
```
minimalne różnice
### knn
```{r}
# Best kNN configuration for each of the three metrics.
# BUG FIX: with an UNNAMED input vector, map_dfr(.id = ".metric")
# records the element index ("1", "2", "3"), not the metric name.
# Record the metric explicitly, as the rpart/xgboost chunks do.
c("mae", "rmse", "rsq") |>
  map_dfr(
    \(x)
    combined_results |>
      extract_workflow_set_result(id = "transformed_knn") |>
      select_best(metric = x) |>
      mutate(.metric = x)
  ) |>
  gt::gt()
```
```{r}
colnames (all_fit_metrics[["transformed_knn" ]])
```
```{r}
# kNN tuning: RMSE vs number of neighbours, connected as one line.
all_fit_metrics[["transformed_knn"]] |>
  filter(.metric == "rmse") |>
  ggplot(aes(neighbors, .estimate, color = factor(neighbors))) +
  geom_point() +
  geom_line(aes(group = 1)) +
  ggtitle(label = "transformed_knn") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
```
RMSE maleje wraz ze wzrostem liczby sąsiadów.
```{r}
# kNN configurations with RMSE below 2.87, best first.
all_fit_metrics[["transformed_knn"]] |>
  filter(.metric == "rmse", .estimate < 2.87) |>
  arrange(.estimate) |>
  gt::gt()
```
Każda z opcji jest dobra - małe różnice